Load the required packages:
library(dplyr)
library(Seurat)
library(patchwork)
library(devtools)
The data were taken from PANGLAODB. It a sample of mammary tissue (mam) from mus musculus that is part of the Tabula Muris project.
load("SRA653146_SRS3044257.sparse.RData")
# retain only the official gene symbol
rownames(sm) <- sapply(strsplit(rownames(sm),"_"), `[`, 1)
#create the Seurat object
mam <- CreateSeuratObject(counts = sm, project = "scMammary", min.cells = 3, min.features = 200)
## Warning: Non-unique features (rownames) present in the input matrix, making
## unique
mam
## An object of class Seurat
## 21839 features across 3454 samples within 1 assay
## Active assay: RNA (21839 features, 0 variable features)
Is a check of the principal control parameters: number of unique genes detected in each cell (called “Features”), the total number of detected molecules (“Count”) and the percentage of reads that map to the mitochondrial genome.
Check the presence of mithocondrial genes (mt-…) Low-quality / dying cells often exhibit extensive mitochondrial contamination.
grep("^mt-",rownames(mam),value = TRUE)
## [1] "mt-Atp6" "mt-Co1" "mt-Co2" "mt-Co3" "mt-Cytb" "mt-Nd1" "mt-Nd2"
## [8] "mt-Nd3" "mt-Nd4" "mt-Nd4l" "mt-Nd5" "mt-Nd6" "mt-Rnr1" "mt-Rnr2"
## [15] "mt-Tc" "mt-Tn" "mt-Tp"
# add a colun with the stats
mam[["percent.mt"]] <- PercentageFeatureSet(mam, pattern = "^mt-")
Check the ribosomal protein genes (Rpl/Rps…)
grep("^Rp[ls]",rownames(mam),value = TRUE)
## [1] "Rpl10" "Rpl10-ps1" "Rpl10-ps2" "Rpl10-ps4" "Rpl10-ps6"
## [6] "Rpl10a" "Rpl10a-ps1" "Rpl11" "Rpl12" "Rpl12-ps1"
## [11] "Rpl12-ps2" "Rpl13" "Rpl13-ps1" "Rpl13-ps5" "Rpl13-ps6"
## [16] "Rpl13a" "Rpl13a-ps1" "Rpl14" "Rpl14-ps1" "Rpl15"
## [21] "Rpl15-ps1" "Rpl15-ps2" "Rpl15-ps3" "Rpl17" "Rpl17-ps1"
## [26] "Rpl17-ps10" "Rpl17-ps3" "Rpl17-ps4" "Rpl17-ps5" "Rpl17-ps8"
## [31] "Rpl17-ps9" "Rpl18" "Rpl18-ps1" "Rpl18-ps2" "Rpl18a"
## [36] "Rpl18a-ps1" "Rpl19" "Rpl19-ps1" "Rpl19-ps11" "Rpl19-ps12"
## [41] "Rpl19-ps2" "Rpl21" "Rpl21-ps1" "Rpl21-ps10" "Rpl21-ps11"
## [46] "Rpl21-ps15" "Rpl21-ps3" "Rpl21-ps5" "Rpl21-ps6" "Rpl21-ps7"
## [51] "Rpl21-ps8" "Rpl21-ps9" "Rpl22" "Rpl22-ps1" "Rpl22l1"
## [56] "Rpl23" "Rpl23a" "Rpl23a-ps1" "Rpl23a-ps14" "Rpl23a-ps2"
## [61] "Rpl23a-ps3" "Rpl23a-ps5" "Rpl24" "Rpl26" "Rpl26-ps4"
## [66] "Rpl27" "Rpl27-ps1" "Rpl27-ps2" "Rpl27-ps3" "Rpl27a"
## [71] "Rpl27a-ps1" "Rpl27a-ps2" "Rpl28" "Rpl28-ps1" "Rpl28-ps3"
## [76] "Rpl29" "Rpl29-ps5" "Rpl3" "Rpl3-ps1" "Rpl3-ps2"
## [81] "Rpl30" "Rpl30-ps1" "Rpl30-ps10" "Rpl30-ps11" "Rpl30-ps3"
## [86] "Rpl30-ps5" "Rpl30-ps9" "Rpl31" "Rpl31-ps1" "Rpl31-ps10"
## [91] "Rpl31-ps11" "Rpl31-ps13" "Rpl31-ps14" "Rpl31-ps15" "Rpl31-ps16"
## [96] "Rpl31-ps17" "Rpl31-ps18" "Rpl31-ps20" "Rpl31-ps6" "Rpl31-ps8"
## [101] "Rpl31-ps9" "Rpl32" "Rpl32-ps" "Rpl34" "Rpl34-ps1"
## [106] "Rpl34-ps2" "Rpl35" "Rpl35a" "Rpl35a-ps2" "Rpl35a-ps3"
## [111] "Rpl35a-ps4" "Rpl35a-ps5" "Rpl35a-ps6" "Rpl35a-ps7" "Rpl36"
## [116] "Rpl36-ps2" "Rpl36-ps3" "Rpl36-ps8" "Rpl36-ps9" "Rpl36a"
## [121] "Rpl36a-ps2" "Rpl36a-ps3" "Rpl36a-ps5" "Rpl36al" "Rpl37"
## [126] "Rpl37a" "Rpl37rt" "Rpl38" "Rpl38-ps1" "Rpl38-ps2"
## [131] "Rpl39" "Rpl39-ps" "Rpl39l" "Rpl4" "Rpl41"
## [136] "Rpl5" "Rpl5-ps1" "Rpl5-ps2" "Rpl6" "Rpl7"
## [141] "Rpl7-ps7" "Rpl7a" "Rpl7a-ps1" "Rpl7a-ps10" "Rpl7a-ps11"
## [146] "Rpl7a-ps12" "Rpl7a-ps14" "Rpl7a-ps3" "Rpl7a-ps5" "Rpl7a-ps7"
## [151] "Rpl7a-ps8" "Rpl7l1" "Rpl8" "Rpl9" "Rpl9-ps4"
## [156] "Rpl9-ps6" "Rpl9-ps7" "Rplp0" "Rplp1" "Rplp1-ps1"
## [161] "Rplp2" "Rps10" "Rps10-ps1" "Rps10-ps2" "Rps10-ps4"
## [166] "Rps11" "Rps11-ps1" "Rps11-ps2" "Rps11-ps3" "Rps11-ps4"
## [171] "Rps12" "Rps12-ps1" "Rps12-ps10" "Rps12-ps13" "Rps12-ps19"
## [176] "Rps12-ps20" "Rps12-ps24" "Rps12-ps3" "Rps12-ps5" "Rps12-ps9"
## [181] "Rps13" "Rps13-ps1" "Rps13-ps2" "Rps13-ps4" "Rps13-ps5"
## [186] "Rps14" "Rps15" "Rps15-ps2" "Rps15a" "Rps15a-ps1"
## [191] "Rps15a-ps2" "Rps15a-ps3" "Rps15a-ps4" "Rps15a-ps5" "Rps15a-ps6"
## [196] "Rps15a-ps7" "Rps15a-ps8" "Rps16" "Rps16-ps2" "Rps17"
## [201] "Rps18" "Rps18-ps1" "Rps19" "Rps19-ps1" "Rps19-ps10"
## [206] "Rps19-ps11" "Rps19-ps12" "Rps19-ps13" "Rps19-ps2" "Rps19-ps3"
## [211] "Rps19-ps4" "Rps19-ps5" "Rps19-ps6" "Rps19-ps7" "Rps19-ps9"
## [216] "Rps19bp1" "Rps2" "Rps2-ps10" "Rps2-ps11" "Rps2-ps13"
## [221] "Rps2-ps2" "Rps2-ps4" "Rps2-ps5" "Rps2-ps9" "Rps20"
## [226] "Rps21" "Rps23" "Rps23-ps1" "Rps23-ps2" "Rps24"
## [231] "Rps24-ps2" "Rps24-ps3" "Rps25" "Rps25-ps1" "Rps26"
## [236] "Rps26-ps1" "Rps27" "Rps27-ps1" "Rps27a" "Rps27a-ps1"
## [241] "Rps27a-ps2" "Rps27l" "Rps27rt" "Rps28" "Rps29"
## [246] "Rps3" "Rps3a1" "Rps3a2" "Rps3a3" "Rps4l"
## [251] "Rps4x" "Rps4x-ps" "Rps5" "Rps6" "Rps6-ps4"
## [256] "Rps6ka1" "Rps6ka2" "Rps6ka3" "Rps6ka4" "Rps6ka5"
## [261] "Rps6ka6" "Rps6kb1" "Rps6kb2" "Rps6kc1" "Rps6kl1"
## [266] "Rps7" "Rps8" "Rps8-ps2" "Rps8-ps4" "Rps9"
## [271] "Rpsa" "Rpsa-ps1" "Rpsa-ps10" "Rpsa-ps11" "Rpsa-ps12"
## [276] "Rpsa-ps3" "Rpsa-ps4" "Rpsa-ps5" "Rpsa-ps9"
# add a column with the stats
mam[["percent.rbp"]] <- PercentageFeatureSet(mam, pattern = "^Rp[ls]")
# Feature = # genes
# Count = # detected molecules
VlnPlot(mam, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4, cols = "mediumpurple1")
# plot without dots:
VlnPlot(mam, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4, pt.size=0, cols = "mediumpurple1")
Then I check if there is a correlation between the different parameters with FeatureScatter plots:
Correlation between % of mitochondrial RNA and number of reads
Correlation between number of genes and number of reads
Correlation between % of rRNA and number of reads
FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "percent.rbp")
Low-quality cells or empty droplets will often have very few genes, while cell doublets or multiplets exhibit an aberrantly high gene count. The total number of molecules detected within a cell correlates strongly with unique genes, is the only visible correlation.
From these plots I decide a threshold for the number of genes: between 200 and 4000. The mithocondrial percentage should be lower than 5%.
mam <- subset(mam, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 5)
mam
## An object of class Seurat
## 21839 features across 3134 samples within 1 assay
## Active assay: RNA (21839 features, 0 variable features)
The remaining samples are 3134 (from 3454 of before)
10x data are usually just transformed into counts per million (or counts x 10,000 reads) The final “expression estimate” it’s given by the log of the normalized counts:
mam <- NormalizeData(mam, normalization.method = "LogNormalize", scale.factor = 10000)
In order to assign a CC phase to our cells, that come from mouse, I needed to create a list of putative genes suitable for this animal.
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart(biomart="ensembl", dataset = "hsapiens_gene_ensembl",
verbose = TRUE, host = "https://dec2021.archive.ensembl.org")
mouse = useMart(biomart="ensembl", dataset = "mmusculus_gene_ensembl",
verbose = TRUE, host = "https://dec2021.archive.ensembl.org")
genes = getLDS(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol", values = x ,
mart = human, attributesL = c("mgi_symbol"),
martL = mouse, uniqueRows=T)
humanx <- unique(genes[, 2])
return(humanx)
}
m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
## V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
## V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
## V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
## V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
mam <- CellCycleScoring(mam, s.features = m.s.genes, g2m.features = m.g2m.genes, set.ident = TRUE)
I create a subset, keeping only the 2000 most variable genes.
mam <- FindVariableFeatures(mam, selection.method = "vst", nfeatures = 2000)
# top 10 most variable genes:
top10 <- head(VariableFeatures(mam), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(mam)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 46 rows containing missing values (geom_point).
I shift the expression of each gene so that the mean is 0 and the variance is 1
all.genes <- rownames(mam)
mam <- ScaleData(mam, features = all.genes)
## Centering and scaling data matrix
I perform PCA on the 2000 most variable genes
mam <- RunPCA(mam, features = VariableFeatures(object = mam))
## PC_ 1
## Positive: Sparc, Serpinh1, Ifitm3, Ctsl, Aebp1, Plpp3, Mt1, Col1a2, Cd63-ps, Gsn
## Mt2, Dcn, Fstl1, Ifitm2, Igfbp7, Col1a1, Rnase4, Serping1, Cd63, Mmp2
## C1s1, Serpinf1, Col6a1, Clec3b, Serpinb6a, Nbl1, Gfpt2, C1ra, Ptrf, Rarres2
## Negative: Cd52, Ccr7, Ighm, Cd79a, Ctss, Cd74, H2-Eb1, H2-Aa, Tmsb4x, Trbc2
## Igkc, H2-Ab1, H2-DMa, Iglc2, Cd83, Cd79b, Hmgb2, Ms4a1, Iglc3, Mzb1
## Trbc1, Epsti1, Traf1, Cd8b1, Cd28, Rgs2, Sh2d2a, Samsn1, Ly6d, Ly86
## PC_ 2
## Positive: Tm4sf1, Tinagl1, F11r, Adgrg1, Fxyd3, Bcam, Hspb1, Cdc42ep3, Cav2, Flt1
## Crip2, Epcam, Cdh5, Itga6, Rasip1, Esam, Sfn, Hbegf, Tuba1c, Lcn2
## RP23-310J6.1, Krt8, Map2k3, Jup, Sox9, RP23-54F9.1, Cd36, Serpinb5, Efhd2, Tjp2
## Negative: Clec3b, Col1a1, Col1a2, Dpt, Dpep1, Igfbp6, Mfap5, Serpinf1, Lum, Col3a1
## Efemp1, Dcn, Ifi205, Rarres2, Pcolce2, Loxl1, Tnfaip6, Fn1, Serping1, Htra3
## Col6a2, Rnase4, Has1, Islr, Col6a1, Fxyd1, Tnxb, Gfpt2, Mmp2, Ogn
## PC_ 3
## Positive: Cdh5, Flt1, Rasip1, Esam, Cd36, Fabp4, Gpihbp1, Cldn5, Adgrf5, Emcn
## Apold1, Pecam1, Grrp1, Cd93, Adgrl4, Iigp1, Mmrn2, Eng, Mgll, Ptprb
## Egfl7, Aqp1, C1qtnf9, Cd300lg, Jam2, Id1, Slfn5, Ctla2a, Sdpr, Kdr
## Negative: Fxyd3, Sfn, Epcam, Sox9, Lgals7, Serpinb5, Krt8, Lcn2, Perp, Tfap2a
## Sdc4, Cdh1, Clca3a2, Gjb4, Krt17, Krt14, Krt5, Fermt1, Aqp3, Cnn1
## Lypd3, Pmepa1, Cxcl14, Krt18, Plet1, Gjb3, Cdcp1, Acta2, Serinc2, Sox10
## PC_ 4
## Positive: Prlr, Fam25c, Cldn4, Krt7, Tmem56, Cxcl15, Smim22, Areg, Pgr, Acot1
## Slc12a2, Cited1, Dkkl1, Krt19, Tspan1, Mlph, Gipc2, Snhg11, Aqp5, Wfdc2
## Wnt7b, Rab25, Cdo1, Basp1, Fxyd2, Cldn3, Cldn7, Lgals3, Slc5a5, F3
## Negative: Krt5, Krt14, Krt17, Acta2, Cnn1, Cxcl14, Tpm2, Myh11, Slpi, Palld
## Cda, Tagln, Col9a2, Lgals7, Myl9, Fermt1, Col17a1, Id4, Ltbp2, Mylk
## Moxd1, Gjb4, Rbp1, RP23-372C7.4, Fst, Chil1, Actg2, Gjb3, AC159092.1, Wnt10a
## PC_ 5
## Positive: Prlr, Trbc2, Fam25c, Cxcl15, Tmem56, Cited1, Snhg11, Krt7, Tspan1, Cldn4
## Wfdc2, Prss22, Gipc2, Smim22, Pgr, Cdo1, Acot1, Krt19, Slc12a2, Rab25
## Stc2, Areg, Ptn, Rnf208, Wnt7b, Slc5a5, Wnt4, Cldn3, Mtmr7, Ly6c1
## Negative: Fcer1g, Tyrobp, Aif1, C1qb, C1qc, Cd68, C1qa, Lyz2, Csf1r, Lilr4b
## Ptafr, Mpeg1, Ccl9, Lrrc25, Cd14, Il1b, Bcl2a1a, Lst1, Cxcl16, Spi1
## Cd300a, Ms4a7, Mrc1, Ccl6, Dnase1l3, Alox5ap, Clec7a, Adgre1, Clec4a2, Bcl2a1d
print(mam[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: Sparc, Serpinh1, Ifitm3, Ctsl, Aebp1
## Negative: Cd52, Ccr7, Ighm, Cd79a, Ctss
## PC_ 2
## Positive: Tm4sf1, Tinagl1, F11r, Adgrg1, Fxyd3
## Negative: Clec3b, Col1a1, Col1a2, Dpt, Dpep1
## PC_ 3
## Positive: Cdh5, Flt1, Rasip1, Esam, Cd36
## Negative: Fxyd3, Sfn, Epcam, Sox9, Lgals7
## PC_ 4
## Positive: Prlr, Fam25c, Cldn4, Krt7, Tmem56
## Negative: Krt5, Krt14, Krt17, Acta2, Cnn1
## PC_ 5
## Positive: Prlr, Trbc2, Fam25c, Cxcl15, Tmem56
## Negative: Fcer1g, Tyrobp, Aif1, C1qb, C1qc
# visualize the most variable genes
VizDimLoadings(mam, dims = 1:2, reduction = "pca")
VizDimLoadings(mam, dims = 3:4, reduction = "pca")
VizDimLoadings(mam, dims = 4:5, reduction = "pca")
Then I visualize the projection of the cells in the first two PCs: (Cells are colored according to cell cycle phase)
DimPlot(mam, reduction = "pca")
The cell cycle seems to be ininfluent!
ElbowPlot(mam, ndims= 40)
Choosing how many dimensions to use can vary depending on the method we choose. In general it’s better to keep all PC until 70/75% of the variance is explained
Rule of the thumb:
pc.touse <- (mam$pca@stdev)^2
pc.touse <- pc.touse/sum(pc.touse)
pc.touse <- cumsum(pc.touse)[1:50]
pc.touse <- min(which(pc.touse>=0.85))
pc.touse
## [1] 22
From this the principal components that retain 75-80% of the variance are 22. I chose to do the analysis for 22 PCs and 15 PCs.
The first step uses the FindNeighbors function, which constructs a KNN graph based on the euclidean distance in PCA space and refines the edge weights using the Jaccard similarity
mam15 <- FindNeighbors(mam, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
To cluster the cell we use the FindClusters function, which uses the Louvain algorithm to iteratively group cells together
mam15 <- FindClusters(mam15, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3134
## Number of edges: 111730
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9337
## Number of communities: 9
## Elapsed time: 0 seconds
# 9 clusters
I plot the clusters using T Stochastic Neighbor Embedding
mam15.tsne <- RunTSNE(mam15, dims=1:15)
DimPlot(mam15.tsne, reduction = "tsne")
Uniform Manifold Approximation and Projection is generally preferred but requires the installation of a new package
mam15.UMAP <- RunUMAP(mam15, dims = 1:15)
DimPlot(mam15.UMAP, reduction = "umap")
We can also check whether some of the critial quality paramteres influenced the clustering we got
VlnPlot(mam15.UMAP,features="nCount_RNA")
VlnPlot(mam15.UMAP,features="nFeature_RNA")
VlnPlot(mam15.UMAP,features="percent.mt")
VlnPlot(mam15.UMAP,features="percent.rbp")
or the cell cycle
library(ggplot2)
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
mam15@meta.data %>%
group_by(seurat_clusters,Phase) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
geom_col() +
ggtitle("Percentage of cell cycle phases per cluster")
For all the other quality check the clustering seems reasonable.
mam22 <- FindNeighbors(mam, dims = 1:22)
## Computing nearest neighbor graph
## Computing SNN
To cluster the cell we use the FindClusters function.
mam22 <- FindClusters(mam22, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3134
## Number of edges: 117528
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8967
## Number of communities: 12
## Elapsed time: 0 seconds
# 12 clusters
I plot the clusters using T Stochastic Neighbor Embedding
mam22.tsne <- RunTSNE(mam22, dims=1:22)
DimPlot(mam22.tsne, reduction = "tsne")
Uniform Manifold Approximation and Projection is generally preferred but requires the installation of a new package
mam22.UMAP <- RunUMAP(mam22, dims = 1:22)
DimPlot(mam22.UMAP, reduction = "umap")
We can also check whether some of the critical quality parameters influenced the clustering we got
VlnPlot(mam22.UMAP,features="nCount_RNA")
VlnPlot(mam22.UMAP,features="nFeature_RNA")
VlnPlot(mam22.UMAP,features="percent.mt")
VlnPlot(mam22.UMAP,features="percent.rbp")
or the cell cycle
mam22@meta.data %>%
group_by(seurat_clusters,Phase) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
geom_col() +
ggtitle("Percentage of cell cycle phases per cluster")
Maybe the quantity of mt RNA in cluster 4 could make the identification of marker genes difficult. The quality check is good also with 22 PCs.
Seurat also includes a function that can be used to find genes over expressed between two clusters or overexpressed in one cluster with respect to all the others
The one vs all clusters analyses can be iterated automatically, and we can output the top n (in this case 5) genes for each cluster.
Notice that here they are sorted by logFC - more informative than “p_val_adj”, since a lot of genes will have a FDR close to zero with smallest changes:
mam15 <- mam15.UMAP
mam15.markers <- FindAllMarkers(mam15, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
mam15.markers %>%
group_by(cluster) %>%
slice_max(n = 5, order_by = avg_log2FC)
## # A tibble: 45 × 7
## # Groups: cluster [9]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 2.49 0.945 0.216 0 0 Trbc2
## 2 0 2.31 0.956 0.17 0 0 Cd3d
## 3 0 2.18 0.746 0.049 0 0 Lef1
## 4 6.70e-180 2.11 0.589 0.123 1.46e-175 0 Trbc1
## 5 0 2.09 0.835 0.122 0 0 Trac
## 6 0 4.91 0.982 0.459 0 1 Igkc
## 7 0 4.02 0.966 0.056 0 1 Cd79a
## 8 0 3.73 0.998 0.684 0 1 Cd74
## 9 0 3.42 0.949 0.104 0 1 H2-DMb2
## 10 0 3.33 0.993 0.413 0 1 H2-Aa
## # … with 35 more rows
mam15.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(mam15, features = top10$gene) + NoLegend()
Looking at the Heatmap we can see that clusters 0 and 4 show quite some of similarities. Also clusters 5, 7 and 8 are similar.
In real life, having two clusters with “too similar” patterns of DE/marker genes might be a problem. Can we conclude that after all they are the same cell type, partitioned wrongly into two separate clusters, or they are indeed two different cell types or subtypes, with a few genes “making the difference” from one another?
We want to asses the cell type shared by cluster 0 and 4.
c0and4.markers <- FindMarkers(mam15, ident.1 = c(0,4), min.pct = 0.25, test.use = "wilcox")
c0and4.markers <- c0and4.markers[order(-c0and4.markers$avg_log2FC), ]
head(c0and4.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Trbc2 0.000000e+00 3.237016 0.941 0.152 0.000000e+00
## Cd3d 0.000000e+00 2.851358 0.944 0.104 0.000000e+00
## Cd3g 0.000000e+00 2.745328 0.864 0.062 0.000000e+00
## Trbc1 1.110342e-235 2.639035 0.592 0.079 2.424877e-231
## Trac 0.000000e+00 2.581300 0.817 0.068 0.000000e+00
## Emb 0.000000e+00 2.411268 0.791 0.155 0.000000e+00
## Ms4a4b 0.000000e+00 2.195321 0.680 0.035 0.000000e+00
## Cd3e 0.000000e+00 2.092902 0.743 0.052 0.000000e+00
## Ramp3 5.159768e-120 2.043142 0.426 0.094 1.126842e-115
## Lef1 0.000000e+00 2.041274 0.656 0.038 0.000000e+00
## Lat 0.000000e+00 1.977822 0.736 0.068 0.000000e+00
## Ms4a6b 1.727429e-285 1.860232 0.688 0.088 3.772531e-281
## Cd8b1 2.098191e-163 1.853494 0.430 0.047 4.582239e-159
## Lck 0.000000e+00 1.833052 0.743 0.096 0.000000e+00
## Saraf 3.308645e-256 1.807223 0.884 0.419 7.225751e-252
## Rapgef6 7.997470e-194 1.797912 0.722 0.246 1.746567e-189
## Tcf7 6.236144e-289 1.765309 0.652 0.065 1.361912e-284
## Nkg7 1.619036e-151 1.740390 0.381 0.031 3.535814e-147
## Hcst 3.731547e-283 1.738405 0.773 0.146 8.149326e-279
## Dapl1 4.227755e-142 1.709802 0.335 0.018 9.232995e-138
Cd3d, Cd3g, Ms4a4b, Trbc2 and 1 (t cell receptor beta), Trac (T cell receptor alfa) - Tcell It seems that they are all T cells
Let’s see the genes making the difference between those two clusters:
Genes overexpressed in cluster 0 vs cluster 4
c0vs4.markers <- FindMarkers(mam15, ident.1 = 0, ident.2 = 4, min.pct = 0.25, test.use = "wilcox")
c0vs4.markers <- c0vs4.markers[order(-c0vs4.markers$avg_log2FC),]
head(c0vs4.markers, n = 30)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Lef1 4.935662e-46 1.8435953 0.746 0.171 1.077899e-41
## Dapl1 2.473530e-18 1.6084840 0.389 0.050 5.401942e-14
## Cd8b1 3.957465e-18 1.4953134 0.482 0.149 8.642708e-14
## Igfbp4 9.256773e-21 1.4814241 0.451 0.088 2.021587e-16
## Rapgef6 5.134435e-27 1.4493804 0.778 0.420 1.121309e-22
## Klf2 5.828091e-28 1.3808189 0.875 0.514 1.272797e-23
## Actn1 1.169199e-17 0.9980133 0.442 0.105 2.553413e-13
## Pik3ip1 2.754335e-14 0.9336694 0.394 0.110 6.015193e-10
## RP23-32C18.6 7.361641e-14 0.8791695 0.624 0.343 1.607709e-09
## Hspa1a 5.474583e-07 0.8472905 0.460 0.276 1.195594e-02
## Npc2 1.383523e-23 0.8169287 0.905 0.729 3.021476e-19
## RP24-146B4.2 1.635966e-12 0.8169043 0.741 0.475 3.572787e-08
## Ms4a6b 6.469028e-15 0.7902985 0.733 0.448 1.412771e-10
## Rps19 3.712274e-62 0.7697729 0.994 1.000 8.107236e-58
## Ablim1 4.186792e-11 0.7186184 0.629 0.387 9.143535e-07
## Cd8a 4.002654e-12 0.7166578 0.336 0.077 8.741397e-08
## Satb1 3.713756e-11 0.7122203 0.828 0.630 8.110472e-07
## RP23-460F17.1 9.210091e-37 0.6737760 0.986 0.967 2.011392e-32
## Rpl12 7.031175e-50 0.6635674 0.993 1.000 1.535538e-45
## Rps17 3.947776e-44 0.6505499 0.992 0.989 8.621548e-40
## RP23-426M5.1 1.179923e-14 0.6383055 0.867 0.724 2.576835e-10
## Rpl5 5.528782e-40 0.6376929 0.989 0.978 1.207431e-35
## AC110186.1 5.286241e-16 0.6280161 0.902 0.768 1.154462e-11
## RP23-146N20.3 2.277385e-09 0.6267107 0.495 0.282 4.973581e-05
## Gimap6 1.044335e-08 0.6079117 0.731 0.547 2.280724e-04
## Rpl36a-ps2 7.940824e-20 0.6048760 0.959 0.895 1.734197e-15
## Rpl35a 1.036344e-56 0.6039809 0.996 1.000 2.263272e-52
## Stambpl1 5.554188e-05 0.5997311 0.323 0.177 1.000000e+00
## Rplp0 4.189802e-48 0.5988264 0.996 0.994 9.150108e-44
## RP23-336M12.2 1.756165e-18 0.5972720 0.939 0.867 3.835289e-14
Dapl1 - death associated protein 1 - T cells activates CD8
Lef1 - lymphoid enhancer factor (regulates T cell receptor) - CD8 T cells
Cd8b1 - CD8 antigen, beta chain 1 - CD8 T cells
Rapgef6 - Rap guanine nucleotide exchange factor (GEF) 6 transmission signal - CD8 T cells
Cluster 0: Cytotoxic T CELLS
And overexpressed in cluster 4 vs cluster 0
c4vs0.markers <- FindMarkers(mam15, ident.1 = 4, ident.2 = 0, min.pct = 0.25, test.use = "wilcox")
c4vs0.markers <- c4vs0.markers[order(-c4vs0.markers$avg_log2FC),]
head(c4vs0.markers, n = 40)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Ccl5 9.178670e-17 4.019912 0.475 0.241 2.004530e-12
## Tnfrsf4 1.297757e-43 3.204517 0.431 0.073 2.834171e-39
## Tnfrsf9 5.663093e-75 2.728546 0.453 0.027 1.236763e-70
## S100a6 7.763668e-54 2.702791 0.779 0.319 1.695507e-49
## Odc1 8.694749e-41 2.330583 0.691 0.267 1.898846e-36
## Lgals1 8.657629e-47 2.308617 0.779 0.344 1.890740e-42
## Bcl2a1b 2.809613e-75 2.293395 0.652 0.100 6.135915e-71
## Xcl1 6.719753e-32 2.255040 0.260 0.029 1.467527e-27
## Ctla4 2.009969e-29 2.217114 0.343 0.067 4.389570e-25
## Lgals3 1.778136e-18 2.205171 0.508 0.244 3.883270e-14
## Rgs1 1.583210e-57 2.179747 0.680 0.162 3.457573e-53
## Rora 7.718335e-73 2.173113 0.602 0.085 1.685607e-68
## Il2rb 5.450829e-88 2.162441 0.862 0.206 1.190407e-83
## Icos 3.311547e-41 2.155042 0.674 0.236 7.232087e-37
## Cxcr6 1.156520e-33 2.135533 0.414 0.096 2.525723e-29
## Fcer1g 2.241468e-33 2.129084 0.254 0.025 4.895141e-29
## Bcl2a1d 5.460339e-91 2.119327 0.552 0.036 1.192483e-86
## S100a11 5.471073e-54 1.871773 0.867 0.378 1.194828e-49
## Id2 2.002882e-31 1.839337 0.762 0.404 4.374094e-27
## Rgs2 2.459777e-43 1.838875 0.818 0.384 5.371908e-39
## S100a4 4.182001e-87 1.830910 0.448 0.012 9.133073e-83
## Crip1 6.623109e-22 1.825804 0.856 0.636 1.446421e-17
## Dgat1 6.392839e-38 1.790821 0.586 0.180 1.396132e-33
## Ikzf2 1.610807e-65 1.782292 0.530 0.064 3.517842e-61
## Eprs 1.714339e-26 1.686793 0.497 0.179 3.743944e-22
## Gata3 1.280414e-29 1.685198 0.448 0.119 2.796297e-25
## Ramp3 1.892877e-11 1.654939 0.580 0.397 4.133853e-07
## Capg 1.788797e-52 1.648506 0.586 0.125 3.906554e-48
## Anxa2 4.719188e-48 1.626626 0.564 0.122 1.030623e-43
## Stx11 1.225240e-61 1.610300 0.398 0.027 2.675802e-57
## Ccr2 3.008786e-55 1.563099 0.304 0.011 6.570889e-51
## Ikzf3 2.182000e-48 1.555595 0.436 0.064 4.765270e-44
## Cd74 2.278393e-06 1.538211 0.729 0.621 4.975782e-02
## Cstb 7.818135e-18 1.532447 0.586 0.307 1.707403e-13
## Tnfrsf18 1.963210e-27 1.531602 0.657 0.313 4.287454e-23
## Lmna 7.664273e-34 1.498586 0.608 0.212 1.673801e-29
## Crem 1.039997e-29 1.490932 0.851 0.522 2.271249e-25
## H2afz 1.502953e-27 1.474332 0.729 0.412 3.282299e-23
## Cd7 3.707604e-17 1.469328 0.365 0.134 8.097036e-13
## Lmo4 5.000408e-11 1.456491 0.265 0.103 1.092039e-06
Ccl5 - chemokine ligand - all the cells (overexpressed in activated T)
Tnfrsf4 and 9 - TNF receptor superfamily (citokyne receptor) - 4 overexpressed in T cells
Il2rb - expressed mainly in T reg cells
Icos - inducible T cell co-stimulator or Il2rb interleukin 2 receptor for T reg
Ctla4 - citotoxic T cell
Cluster 4: Regulatory T CELLS
c4.markers <- FindMarkers(mam15, ident.1 = 4, min.pct = 0.25, test.use = "wilcox")
c4.markers<- c4.markers[order(-c4.markers$avg_log2FC),]
head(c4.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Tnfrsf4 1.688780e-82 3.386692 0.431 0.055 3.688127e-78
## Ccl5 2.410088e-18 2.963408 0.475 0.252 5.263391e-14
## Tnfrsf9 1.168141e-148 2.757913 0.453 0.026 2.551104e-144
## Icos 3.534140e-129 2.642139 0.674 0.091 7.718209e-125
## Il2rb 2.950531e-220 2.551248 0.862 0.087 6.443664e-216
## Ramp3 5.388275e-45 2.526703 0.580 0.194 1.176745e-40
## Ctla4 1.524957e-77 2.390548 0.343 0.034 3.330353e-73
## Xcl1 1.796225e-91 2.332211 0.260 0.012 3.922775e-87
## Cxcr6 1.212125e-97 2.296950 0.414 0.040 2.647160e-93
## Odc1 1.549042e-38 2.194837 0.691 0.334 3.382953e-34
## Rgs1 6.155512e-97 2.117851 0.680 0.124 1.344302e-92
## Rora 8.504234e-82 2.085887 0.602 0.124 1.857240e-77
## Nkg7 1.589384e-26 2.068852 0.414 0.144 3.471056e-22
## Tnfrsf18 5.158250e-81 2.057356 0.657 0.147 1.126510e-76
## Dgat1 2.401817e-71 1.963953 0.586 0.128 5.245329e-67
## Ikzf2 7.169379e-160 1.909731 0.530 0.035 1.565721e-155
## Id2 7.789812e-47 1.882657 0.762 0.328 1.701217e-42
## Rgs2 8.197631e-55 1.774368 0.818 0.343 1.790281e-50
## Gata3 1.122760e-48 1.746248 0.448 0.099 2.451995e-44
## Bcl2a1b 3.917433e-68 1.746028 0.652 0.154 8.555282e-64
Cluster 5, 7 and 8
c5and6and7.markers <- FindMarkers(mam15, ident.1 = c(5,6,7), min.pct = 0.25, test.use = "wilcox")
c5and6and7.markers <- c5and6and7.markers[order(-c5and6and7.markers$avg_log2FC), ]
head(c5and6and7.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Krt14 2.577047e-80 4.148111 0.426 0.092 5.628013e-76
## Lyz2 2.311256e-98 4.124770 0.372 0.049 5.047553e-94
## Krt17 7.788089e-81 4.013333 0.435 0.093 1.700841e-76
## Plet1 1.075669e-83 3.932679 0.599 0.180 2.349153e-79
## Lcn2 1.517368e-123 3.619132 0.645 0.158 3.313779e-119
## Slpi 1.455057e-104 3.562847 0.457 0.076 3.177698e-100
## Sfn 4.343352e-77 3.483441 0.648 0.253 9.485447e-73
## Acta2 1.805194e-80 3.408883 0.389 0.073 3.942364e-76
## Cxcl14 3.513107e-72 3.119368 0.386 0.082 7.672274e-68
## Epcam 3.968868e-122 2.987532 0.625 0.142 8.667610e-118
Krt14 and Krt17 basal cells, Lyz2 macrophages, Plet1 al biological processes? - basal cells luminal epitelial, Slpi basal cells, Sfn basal and luminal epitelial, Acta2 and Cxcl14 basal
seem to be basal cell, but I need to further assess the gene expression
Cluster 5
c5vs78.markers <- FindMarkers(mam15, ident.1 = 5, ident.2 = c(7,8), min.pct = 0.25, test.use = "wilcox")
c5vs78.markers <- c5vs78.markers[order(-c5vs78.markers$avg_log2FC),]
head(c5vs78.markers , n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Lyz2 2.593171e-29 5.268751 0.759 0.181 5.663226e-25
## Ccl4 7.953952e-11 4.415573 0.447 0.141 1.737064e-06
## H2-Ab1 7.664238e-23 3.764166 0.844 0.752 1.673793e-18
## Apoe 7.583332e-07 3.628451 0.723 0.671 1.656124e-02
## C1qb 4.368295e-13 3.518512 0.404 0.067 9.539919e-09
## Ccl7 9.815155e-03 3.438505 0.426 0.376 1.000000e+00
## C1qa 4.658022e-13 3.436493 0.397 0.060 1.017265e-08
## Cxcl2 2.153981e-04 3.408237 0.433 0.302 1.000000e+00
## Il1b 3.461477e-12 3.282038 0.447 0.107 7.559519e-08
## Tyrobp 1.627813e-28 3.064122 0.872 0.315 3.554980e-24
Lyz2, Ccl4, C1qb - ONLY expressed in macrophages
H2-Ab1 - b cells and macrophages
Cluster 5 vs ALL: Lyz2 -> produces Lyzozyme which has bacteriolytic function, contained in macrophages
Ccl4, IL-1β expressed in a wide range of cells, especially in macrophages
MACROPHAGES
Cluster 7
c7vs8.markers <- FindMarkers(mam15, ident.1 = 7, ident.2 = 8, min.pct = 0.25, test.use = "wilcox")
c7vs8.markers <- c7vs8.markers[order(-c7vs8.markers$avg_log2FC),]
head(c7vs8.markers , n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Ccl5 7.896022e-03 5.651256 0.579 0.556 1.000000e+00
## Lcn2 9.588262e-12 4.104435 0.768 0.333 2.093981e-07
## Wfdc18 2.419885e-10 4.095119 0.653 0.167 5.284788e-06
## Csn3 1.660745e-09 4.071832 0.600 0.130 3.626901e-05
## Cst3 2.644240e-18 4.066987 0.947 0.648 5.774756e-14
## Epsti1 2.175447e-05 3.971046 0.411 0.111 4.750958e-01
## Plet1 1.667773e-06 3.579314 0.821 0.741 3.642250e-02
## Vim 9.131721e-04 3.379894 0.526 0.352 1.000000e+00
## Aldh1a3 4.932237e-10 3.236404 0.589 0.074 1.077151e-05
## Lsp1 3.269450e-03 3.162157 0.400 0.241 1.000000e+00
Csn3 - luminal epithelial (caseina) - milk gene
Lcn2 lipocalin 2 - luminal progenitor
Plet1 - luminal cells
Wfdc18 - WAP four-disulfide core domain 18 - milk gene in the luminal progenitor
Aldh1a3 - progenitor luminal marker (from article)
Cluster 7 vs ALL: Csn3 produces caseine in luminal epithelial cells, Wfdc18 WAP is produced selectively by committed luminal cells within mammary ducts and alveoli
LUMINAL EPITHELIAL PROGENITOR
Cluster 8
c8vs7.markers <- FindMarkers(mam15, ident.1 = 8, ident.2 = 7, min.pct = 0.25, test.use = "wilcox")
c8vs7.markers<- c8vs7.markers[order(-c8vs7.markers$avg_log2FC),]
head(c8vs7.markers , n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Ptn 2.150207e-26 5.127102 0.981 0.211 4.695838e-22
## Fam25c 4.283342e-28 4.152706 0.963 0.063 9.354391e-24
## Areg 3.516379e-21 4.119608 0.889 0.211 7.679420e-17
## Gpx3 1.417817e-25 4.072971 0.944 0.126 3.096370e-21
## Wfdc2 1.562589e-25 3.786876 0.981 0.211 3.412539e-21
## Ly6a 2.395399e-23 3.534809 0.963 0.295 5.231312e-19
## Fxyd2 1.196481e-10 3.485120 0.796 0.358 2.612996e-06
## Nupr1 1.409345e-18 3.403162 0.963 0.579 3.077868e-14
## Prlr 3.688473e-27 3.357316 0.963 0.116 8.055256e-23
## Slc12a2 5.449938e-20 3.129085 0.852 0.179 1.190212e-15
## Ly6d 5.553404e-19 3.072081 0.981 0.589 1.212808e-14
## Cd164 2.357259e-22 3.016621 0.963 0.505 5.148019e-18
## Tgm2 5.827858e-21 3.013323 0.870 0.179 1.272746e-16
## Glul 1.024538e-22 2.965690 1.000 0.484 2.237488e-18
## Cited1 3.918453e-20 2.857229 0.833 0.126 8.557510e-16
## RP23-396L12.1 2.262669e-13 2.852974 0.519 0.021 4.941442e-09
## F3 1.177899e-16 2.585397 0.944 0.547 2.572413e-12
## Gadd45g 2.268068e-16 2.509278 0.907 0.400 4.953234e-12
## Mt1 5.901599e-15 2.504591 0.981 0.916 1.288850e-10
## Acot1 1.011265e-20 2.248728 0.833 0.116 2.208502e-16
Ptn - pleiotropin, expressed in epithelial cells of the mammary gland
Amphiregulin (AREG), a ligand for epidermal growth factor receptor, is required for mammary gland ductal morphogenesis - found in mature luminal
Ly6a marker for mature luminal cells
Prlr - prolactine receptor
cluster8.markers <- FindMarkers(mam15, ident.1 = 8, min.pct = 0.25, test.use = "wilcox")
cluster8.markers<- cluster8.markers[order(-cluster8.markers$avg_log2FC),]
head(cluster8.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Ptn 1.638583e-88 4.966034 0.981 0.133 3.578500e-84
## Krt19 2.771448e-57 4.664426 1.000 0.257 6.052566e-53
## Krt18 1.040501e-54 4.199693 1.000 0.264 2.272349e-50
## Areg 2.507864e-99 4.188402 0.889 0.085 5.476924e-95
## Fam25c 6.338078e-236 4.144952 0.963 0.031 1.384173e-231
## Fxyd2 9.561550e-85 4.089939 0.796 0.073 2.088147e-80
## Krt8 1.632043e-67 3.908918 1.000 0.190 3.564218e-63
## Nupr1 4.767968e-50 3.747800 0.963 0.244 1.041277e-45
## Wfdc2 2.931051e-109 3.746610 0.981 0.097 6.401123e-105
## F3 1.189587e-83 3.729408 0.944 0.123 2.597939e-79
## Prlr 5.083734e-242 3.419781 0.963 0.030 1.110237e-237
## Slc12a2 4.102617e-129 3.262834 0.852 0.054 8.959706e-125
## Gpx3 1.024273e-61 3.212295 0.944 0.163 2.236910e-57
## Ly6d 4.641992e-36 3.185266 0.981 0.403 1.013765e-31
## Krt7 7.100285e-152 3.132990 1.000 0.065 1.550631e-147
## S100a6 1.686300e-34 3.087429 1.000 0.515 3.682710e-30
## Cldn4 4.966306e-154 3.029716 0.889 0.046 1.084592e-149
## Cited1 1.144805e-175 2.934301 0.833 0.033 2.500140e-171
## RP23-396L12.1 5.229750e-131 2.821598 0.519 0.015 1.142125e-126
## Bhlhe41 6.679118e-84 2.817685 0.852 0.086 1.458653e-79
Areg, Prlr prolactin receptor
Pgr progesterone receptor - Mature luminal (from tabula muris article)
MATURE LUMINAL
Cluster 1 vs all
cluster1.markers <- FindMarkers(mam15, ident.1 = 1, min.pct = 0.25, test.use = "wilcox")
cluster1.markers<- cluster1.markers[order(-cluster1.markers$avg_log2FC),]
head(cluster1.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Igkc 0.000000e+00 4.909842 0.982 0.459 0.000000e+00
## Cd79a 0.000000e+00 4.018468 0.966 0.056 0.000000e+00
## Cd74 0.000000e+00 3.730391 0.998 0.684 0.000000e+00
## H2-DMb2 0.000000e+00 3.419685 0.949 0.104 0.000000e+00
## H2-Aa 0.000000e+00 3.325257 0.993 0.413 0.000000e+00
## H2-Ab1 0.000000e+00 3.291506 0.991 0.485 0.000000e+00
## Iglc2 0.000000e+00 3.149693 0.801 0.093 0.000000e+00
## Ighm 0.000000e+00 3.062614 0.959 0.371 0.000000e+00
## H2-Eb1 0.000000e+00 3.016879 0.991 0.388 0.000000e+00
## Ebf1 0.000000e+00 2.845617 0.981 0.257 0.000000e+00
## Cd79b 0.000000e+00 2.634404 0.721 0.081 0.000000e+00
## Ighd 0.000000e+00 2.563363 0.765 0.058 0.000000e+00
## Fcmr 0.000000e+00 2.484154 0.694 0.049 0.000000e+00
## Mef2c 0.000000e+00 2.462174 0.763 0.103 0.000000e+00
## Ms4a1 0.000000e+00 2.457968 0.638 0.028 0.000000e+00
## Cd83 2.117549e-278 2.412450 0.758 0.149 4.624516e-274
## Iglc3 1.163791e-293 2.335728 0.623 0.047 2.541603e-289
## H2-Ob 0.000000e+00 2.261084 0.761 0.090 0.000000e+00
## Scd1 1.651398e-305 2.216370 0.752 0.127 3.606487e-301
## Cd37 3.900746e-273 2.106576 0.883 0.373 8.518840e-269
Igkc immunoglobulin kappa constant - B cells
Cd74 - B cells and macrophages
Cd79a - B cells creates Ig alpha
Cd79b - B cells creates Ig beta
H2-.. - histocompatibility complex 2, found on B cells
cluster2.markers <- FindMarkers(mam15, ident.1 = 2, min.pct = 0.25, test.use = "wilcox")
cluster2.markers<- cluster2.markers[order(-cluster2.markers$avg_log2FC),]
head(cluster2.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Dcn 2.011460e-306 5.964926 0.965 0.220 4.392827e-302
## Gsn 1.501882e-252 5.337764 0.998 0.448 3.279960e-248
## Tnfaip6 0.000000e+00 5.148355 0.940 0.164 0.000000e+00
## Col3a1 0.000000e+00 4.929044 0.878 0.075 0.000000e+00
## Col1a2 0.000000e+00 4.539514 0.940 0.038 0.000000e+00
## Col1a1 0.000000e+00 4.433464 0.918 0.026 0.000000e+00
## Clec3b 0.000000e+00 4.383540 0.876 0.056 0.000000e+00
## Ccl2 4.182943e-137 4.166515 0.707 0.214 9.135129e-133
## Cxcl1 1.130420e-145 4.053782 0.792 0.266 2.468724e-141
## Igfbp6 0.000000e+00 4.008005 0.896 0.107 0.000000e+00
## Plac9c 0.000000e+00 3.978710 0.784 0.075 0.000000e+00
## Lum 0.000000e+00 3.902518 0.881 0.064 0.000000e+00
## Dpt 0.000000e+00 3.727592 0.841 0.048 0.000000e+00
## Fn1 0.000000e+00 3.673571 0.739 0.061 0.000000e+00
## Postn 1.366508e-271 3.635399 0.767 0.089 2.984317e-267
## Serpinf1 0.000000e+00 3.631132 0.888 0.057 0.000000e+00
## Ifi205 0.000000e+00 3.596225 0.849 0.040 0.000000e+00
## Ptx3 8.410029e-161 3.540867 0.618 0.113 1.836666e-156
## Mmp3 4.404092e-203 3.475974 0.566 0.055 9.618097e-199
## Ccl11 0.000000e+00 3.472359 0.784 0.039 0.000000e+00
Dcn decorin (proteoglycan) - Stromal cells
Gsn gelsolina (actin binding protein) - Stromal cells
Tnfaip6 - stromal cells
Col3a1 collagene type III - Stromal cells
Col1 collagen type II - Stromal cells
cluster3.markers <- FindMarkers(mam15, ident.1 = 3, min.pct = 0.25, test.use = "wilcox")
cluster3.markers<- cluster3.markers[order(-cluster3.markers$avg_log2FC),]
head(cluster3.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Fabp4 1.566507e-152 6.390228 0.815 0.225 3.421095e-148
## Cldn5 9.642696e-211 4.474197 0.555 0.035 2.105868e-206
## Aqp1 2.007816e-134 4.215125 0.509 0.062 4.384870e-130
## Id1 5.991903e-134 3.883379 0.566 0.083 1.308572e-129
## Tm4sf1 2.165186e-110 3.724753 0.755 0.247 4.728551e-106
## Cd36 1.526337e-187 3.673419 0.596 0.058 3.333367e-183
## Iigp1 9.138218e-145 3.380963 0.615 0.091 1.995696e-140
## Ly6c1 1.356899e-107 3.191548 0.653 0.162 2.963331e-103
## Gpihbp1 1.873827e-275 3.175703 0.509 0.009 4.092250e-271
## Ctla2a 8.744844e-98 3.135045 0.615 0.154 1.909786e-93
## Flt1 0.000000e+00 2.957174 0.604 0.012 0.000000e+00
## Rasip1 0.000000e+00 2.951594 0.642 0.018 0.000000e+00
## Ifitm3 8.971972e-96 2.919977 0.830 0.436 1.959389e-91
## Isg15 1.026771e-96 2.860699 0.547 0.113 2.242365e-92
## Rnd1 1.006319e-99 2.775785 0.396 0.048 2.197699e-95
## Ier3 5.608218e-80 2.758546 0.675 0.238 1.224779e-75
## Akap12 3.833581e-113 2.750555 0.502 0.075 8.372159e-109
## Emp1 1.013209e-52 2.745602 0.630 0.287 2.212747e-48
## Cdkn1a 1.096357e-87 2.740566 0.717 0.278 2.394335e-83
## Esam 3.950912e-301 2.721911 0.608 0.018 8.628397e-297
Fabp4 fatty acid biding protein - endotelial cells
Cldn5 endothelial thight junctions - endotelial cells
Aqp1 aquaporin 1 - endotelial cells
Cd36 lipid transport - endotelial cells
Esam - endotelial cell specific adhesion protein
Cluster 6
c6vs8.markers <- FindMarkers(mam15, ident.1 = 6, ident.2 = 8, min.pct = 0.25, test.use = "wilcox")
c6vs8.markers <- c6vs8.markers[order(-c6vs8.markers$avg_log2FC),]
head(c6vs8.markers , n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Krt14 1.064588e-25 5.809747 0.991 0.167 2.324954e-21
## Krt17 6.728689e-26 5.591590 1.000 0.296 1.469478e-21
## Acta2 1.855855e-25 5.261605 0.991 0.259 4.053002e-21
## Slpi 9.494052e-25 5.052529 0.983 0.148 2.073406e-20
## Cxcl14 8.813886e-26 4.969043 0.991 0.130 1.924865e-21
## Apoe 1.171922e-25 4.768597 1.000 0.611 2.559360e-21
## Lcn2 3.024460e-25 4.703949 0.991 0.333 6.605118e-21
## Tpm2 4.782964e-26 4.459295 1.000 0.241 1.044551e-21
## Chil1 6.708454e-19 3.814005 0.828 0.056 1.465059e-14
## Fst 7.451984e-23 3.572644 0.922 0.056 1.627439e-18
Krt14 and Krt17 marker for basal cells, also Acta2
BASAL CELLS
Plot the expression of the markers with a heatmap
FeaturePlot(mam15, features = c("Trbc2", "Cd79a", "Dcn", "Fabp4", "Lyz2", "Krt14", "Plet1", "Areg"))
I inspect in depth the subpopulations of T cells:
FeaturePlot(mam15, features = "Il2rb", repel = T)
FeaturePlot(mam15, features = "Cd8b1", repel = T)
Or with a violin plot
VlnPlot(mam15, features = "Cd8b1")
VlnPlot(mam15, features = "Cd79a")
VlnPlot(mam15, features = "Dcn")
VlnPlot(mam15, features = "Fabp4")
VlnPlot(mam15, features = "Il2rb")
VlnPlot(mam15, features = "Lyz2")
VlnPlot(mam15, features = "Krt14")
VlnPlot(mam15, features = "Plet1")
VlnPlot(mam15, features = "Areg")
We can visualize all the marker genes and their expression (CPM) in each cluster using a dot plot.
library(ggrepel)
DotPlot(mam15, features = c("Trbc2", "Cd79a", "Dcn", "Fabp4", "Il2rb", "Lyz2", "Krt14", "Plet1", "Areg")) + theme(axis.text.x = element_text(angle = 90))
Dot plot with Cd8b1 gene marker for cluster 0 instead of Trbc2:
library(ggrepel)
DotPlot(mam15, features = c("Cd8b1", "Cd79a", "Dcn", "Fabp4", "Il2rb", "Lyz2", "Krt14", "Plet1", "Areg")) + theme(axis.text.x = element_text(angle = 90))
Both cluster 0 and 4 are T cells, but it seems that only cluster 0 contains CD8+ T cells.
Using marker genes to infere the subtypes of each cluster was quite difficult and needed a deep knowlegde of the fild (which in our case is limited). Eventually we were able to identify 10 different cells types of which 4 subtypes of ODC, 2 subtypes of AST and Microglia, Neuros, OPC ad Endothelial cells.
Comparing the results with the automated pipeline employed by Panglao, we were able to label and cluster the cells with unknown cell type. Also we did not found Pancreatich stellate cells and Pericytes, may be due to different clustering parameters.
new.cluster.ids <- c("T cytotoxic", "B cells", "Stromal cells", "Endotelial cells", "T reg", "Macrophages", "Basal cells", "Luminal Progenitor ", 'Mature Luminal')
names(new.cluster.ids) <- levels(mam15)
MAM15<- RenameIdents(mam15, new.cluster.ids)
DimPlot(MAM15, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
DimPlot(mam15.tsne, reduction = “tsne”)
names(new.cluster.ids) <- levels(mam15.tsne)
MAM15.TSNE<- RenameIdents(mam15.tsne, new.cluster.ids)
DimPlot(MAM15.TSNE, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()